---
title: "MicroMic project dada2 Run2"
output: html_document
editor_options: 
  chunk_output_type: console
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

Cutadapt 

```{r cutadapt}

# Charlie Pauvert, 2018-10-11
# Using the tutorial at http://benjjneb.github.io/dada2/ITS_workflow.html


library(dada2)
library(ShortRead)
library(Biostrings)

#########################
#                       #
#   Data preparation    #
#                       #
#########################

# Path to the sequences
path<-"/seqs"
# Path of the output
outp<-"/data"

# List FASTQ files Forward and Reverse
# In this dataset, the sequences with the forward primer are the rev files with the 2:N:0:0 in the FASTQ header
fqF <-sort(list.files(path, pattern="*_rev.fastq"))
fqR <-sort(list.files(path, pattern="*_fwd.fastq"))

# Extract sample names
sample.names <- gsub("_rev.fastq","",fqF)

# Add the full path to the sequences
fqF <- file.path(path, fqF)
fqR <- file.path(path, fqR)

#########################
#                       #
#    Primers checks     #
#                       #
#########################

# Identify the primers sequences, in 5'-3' orientation
FWD <- "AACMGGATTAGATACCCKG"  # 779F 
REV <- "AGGGTTGCGCTCGTTG"    # 1115R 

# Function to reorient the primers
allOrients <- function(primer) {
  # Create all orientations of the input sequence
  require(Biostrings)
  dna <- DNAString(primer)  # The Biostrings works w/ DNAString objects rather than character vectors
  orients <- c(Forward = dna, Complement = complement(dna), Reverse = reverse(dna), 
               RevComp = reverseComplement(dna))
  return(sapply(orients, toString))  # Convert back to character vector
}

FWD.orients <- allOrients(FWD)
REV.orients <- allOrients(REV)

# Visual check
FWD.orients

# Count the occurrences of the primers sequences in a random sample
sample_to_check<-sample(seq_len(length(fqF)),1)

# Remove the ambuiguous nucletotides from these sequences
# fnFs.filtN <- file.path("filtN", basename(fqF)) # Put N-filtered files in filtN/ subdirectory
# fnRs.filtN <- file.path("filtN", basename(fqR))
# IF starting from files that were not compressed (not .gz) add it
# manually to the future name of cutadapt filtered files
fnFs.filtN <- file.path("filtN", paste0(basename(fqF),".gz")) # Put N-filtered files in filtN/ subdirectory
fnRs.filtN <- file.path("filtN", paste0(basename(fqR), ".gz"))

removeNs <- filterAndTrim(fqF, fnFs.filtN, fqR, fnRs.filtN, maxN = 0)

# Matrix with raw reads number and number of ambiguous nucleotides
colnames(removeNs)<-c("raw","no.Ns")

primerHits <- function(primer, fn) {
  # Counts number of reads in which the primer is found
  nhits <- vcountPattern(primer, sread(readFastq(fn)), fixed = FALSE)
  return(sum(nhits > 0))
}

#Check the number of primers found in the sequences for one random sample
primer.check<-rbind(
  FWD.ForwardReads = sapply(FWD.orients, primerHits, fn = fnFs.filtN[sample_to_check]), 
  FWD.ReverseReads = sapply(FWD.orients, primerHits, fn = fnRs.filtN[sample_to_check]), 
  REV.ForwardReads = sapply(REV.orients, primerHits, fn = fnFs.filtN[sample_to_check]), 
  REV.ReverseReads = sapply(REV.orients, primerHits, fn = fnRs.filtN[sample_to_check])
)
write.table(primer.check, file.path(outp, "primer.check.txt"))

#########################
#                       #
#    Primers removal    #
#                       #
#########################


# Remove the primers
cutadapt<-"/usr/local/bin/cutadapt"

path.cut<-"/data"
if(!dir.exists(path.cut)) dir.create(path.cut)

#Prepare the output of cutadapt
fnFs.cut <- file.path(path.cut, basename(fqF))
fnRs.cut <- file.path(path.cut, basename(fqR))

# FWD.RC <- as.character(FWD.orients["RevComp"])
# REV.RC <- as.character(REV.orients["RevComp"])

# Trim FWD and the reverse-complement of REV off of R1 (forward reads)
# R1.flags <- paste("-g", FWD, "-a", REV.RC) 
R1.flags <- paste("-g", FWD) 

# Trim REV and the reverse-complement of FWD off of R2 (reverse reads)
# R2.flags <- paste("-G", REV, "-A", FWD.RC)
R2.flags <- paste("-G", REV) 

# Run Cutadapt
library(plyr)
l_ply(seq_along(fqF), function(i){
  system2(cutadapt, args = c(R1.flags, R2.flags, "-n", 2, # -n 2 required to remove FWD and REV from reads
                             "-o", fnFs.cut[i], "-p", fnRs.cut[i], # output files
                             "-e",0, # No error in the primer sequences,
                             "--discard-untrimmed", # Keep only the sequences with exact primers sequences
                             fnFs.filtN[i], fnRs.filtN[i])) # input files
})

#Check again the number of primers found in the sequences for one random sample
primer.check.cutadapt<-rbind(
  FWD.ForwardReads = sapply(FWD.orients, primerHits, fn = fnFs.cut[sample_to_check]), 
  FWD.ReverseReads = sapply(FWD.orients, primerHits, fn = fnRs.cut[sample_to_check]), 
  REV.ForwardReads = sapply(REV.orients, primerHits, fn = fnFs.cut[sample_to_check]), 
  REV.ReverseReads = sapply(REV.orients, primerHits, fn = fnRs.cut[sample_to_check])
)

write.table(primer.check.cutadapt, file.path(outp, "primer.check.cutadapt.txt"))

#########################
#                       #
#    Quality checks     #
#                       #
#########################


# Plot the quality profile of two random samples
quality_to_check<-sample(seq_len(length(fqF)),2)

(pqF<-plotQualityProfile(fnFs.cut[quality_to_check]))
(pqR<-plotQualityProfile(fnRs.cut[quality_to_check]))

# Save the plots
library(ggplot2)
ggsave(file.path(outp,"lb-pqF.png"), pqF, width = 8, height = 6, dpi = 300)
ggsave(file.path(outp,"lb-pqR.png"), pqR, width = 8, height = 6, dpi = 300)
```

# Using the tutorial at http://benjjneb.github.io/dada2/ITS_workflow.html

1 -  Generating an ASV table with DADA2
```{r paths_and_libraries , eval=FALSE}
setwd("~/These/Projects")

# Libraries 
library(dada2)
library(ShortRead)
library(Biostrings)

# Set paths 
path <- "MicroMic/R_code/Pipelines/output/Run2_16S/cutadapt"
outp <-"MicroMic/R_code/Pipelines/output/Run2_16S/output"

# Sample names 
list.files(path)
```

# Quality filters 
```{r sequencing_quality, eval=FALSE }

# Forward and reverse sequences are inverse in our data sets 
# Get the path for each sample 
fnFs <- sort(list.files(path, pattern="rev.fastq", full.names = TRUE))# CHANGE if different file extensions
fnRs <- sort(list.files(path, pattern="fwd.fastq", full.names = TRUE))

fnFs <- list.files(path, pattern="rev.fastq")# CHANGE if different file extensions
fnRs <- list.files(path, pattern="fwd.fastq")

# Filtering
path.filtered <-  "MicroMic/R_code/Pipelines/output/Run2_16S/cutadapt_and_filtered"
out <- filterAndTrim(file.path(path,fnFs), file.path(path.filtered,fnFs),
                     file.path(path,fnRs), file.path(path.filtered,fnRs), 
                     maxN = 0, maxEE = c(2, 2), 
                     truncQ = 2, minLen = 50, rm.phix = TRUE, compress = TRUE,verbose=TRUE)

# Matrix with cutadapted sequences number and quality filter
colnames(out)<-c("cutadapt","quality")
write.csv(out, "MicroMic/R_code/Pipelines/output/Run2_16S/cutadapt_and_filtered/quality.csv")

# Extract sample names, assuming filenames have format: SAMPLENAME_XXX.fastq
fnFs <- sort(list.files(path, pattern="_rev.fastq", full.names = TRUE))
filtFs <- list.files(path.filtered, pattern="_rev.fastq", full.names=TRUE)

fnRs <- sort(list.files(path, pattern="_fwd.fastq", full.names = TRUE))
filtRs <- list.files(path.filtered, pattern="_fwd.fastq", full.names=TRUE)

fq <-list.files(path.filtered, pattern="*_rev.fastq")
sample.names <- gsub("_rev.fastq","",fq)
names(filtFs) <- sample.names
names(filtRs) <- sample.names
```

# Learn error 
```{r Learn_error, eval=FALSE }
# In parallel
require(doParallel,quietly = T)
no_cores <- detectCores()-1
cl <- makeCluster(no_cores) 
registerDoParallel(cl)

# Plot errors 
library(ggplot2)
errF <- learnErrors(filtFs, multithread=no_cores)
saveRDS(errF, file.path(outp,"errF.RDS"))
plot_errF <- plotErrors(errF, nominalQ=TRUE)
ggsave(file.path(outp,"plot_errF.png"),plot_errF, width = 8, height = 6, dpi = 300)

errR <- learnErrors(filtRs, multithread=no_cores)
saveRDS(errR, file.path(outp,"errR.RDS"))
plot_errR <-plotErrors(errR, nominalQ=TRUE)
ggsave(file.path(outp,"plot_errR.png"),plot_errR, width = 8, height = 6, dpi = 300)
```

# Dereplication & Denoising
```{r dereplication, eval=FALSE }
# Infer sequence variants

# FORWARD reads
dadaFs <- dada(filtFs, err=errF, multithread=TRUE)
saveRDS(dadaFs, file = file.path(outp,"dadaFs.RDS"), compress = "gzip")

# REVERSE reads
dadaRs <- dada(filtRs, err=errR, multithread=TRUE)
saveRDS(dadaRs, file = file.path(outp,"dadaRs.RDS"), compress = "gzip")
```

# Merge paires
```{r merge_paires, eval=FALSE}
mergers <- mergePairs(dadaFs, filtFs, dadaRs, filtRs, verbose=TRUE)
saveRDS(mergers, file = file.path(outp,"mergers.RDS"), compress = "gzip")

# Inspect the merger data.frame from the first sample
head(mergers[[1]])
```

# Infer ASVs 
```{r infer_ASV, eval=FALSE}
seqtab <- makeSequenceTable(mergers)
dim(seqtab)
saveRDS(seqtab, file.path(outp,"seqtab.RDS"),compress="gzip")

# Inspect distribution of sequence lengths
table(nchar(getSequences(seqtab)))
```

# Remove chimeras 
```{r remove_chimeras, eval=FALSE}
seqtab.nochim <- removeBimeraDenovo(seqtab, method="consensus", multithread=no_cores, verbose=TRUE)
saveRDS(seqtab.nochim, file.path(outp,"seqtab.no.chimera.RDS"),compress="gzip")
dim(seqtab.nochim)
```

# Collapse ASVs  
```{r collapse_ASV, eval=FALSE}
##############################
#                            #
#  Collapse no Mismatch CV   #
#                            #
##############################


collapseNoMismatch_CV <- function (seqtab, minOverlap = 20, orderBy = "abundance", vec = TRUE, 
                                   verbose = FALSE) 
{
  unqs.srt <- sort(getUniques(seqtab), decreasing = TRUE)
  seqs <- names(unqs.srt)
  print(paste(length(seqs), " unique sequences"))
  seqs.out <- character(0)
  collapsed <- matrix(0L, nrow = nrow(seqtab), ncol = ncol(seqtab))
  colnames(collapsed) <- colnames(seqtab)
  rownames(collapsed) <- rownames(seqtab)
  i <- 0
  for (query in seqs) {
    added = FALSE
    prefix <- substr(query, 1, minOverlap)
    i <- i+1
    print(paste(i, " out of ", length(seqs), " unique sequences"))
    for (ref in seqs.out) {
      prefix.ref <- substr(ref, 1, minOverlap)
      if (grepl(prefix, ref, fixed = TRUE) || grepl(prefix.ref, 
                                                    query, fixed = TRUE)) {
        if (nwhamming(query, ref, vec = vec, band = 16) == 
            0) {
          collapsed[, ref] <- collapsed[, ref] + seqtab[, 
                                                        query]
          added = TRUE
          break
        }
      }
    }
    if (!added) {
      collapsed[, query] <- seqtab[, query]
      seqs.out <- c(seqs.out, query)
    }
  }
  if (!identical(unname(colSums(collapsed) > 0), colnames(collapsed) %in% 
                 seqs.out)) {
    stop("Mismatch between output sequences and the collapsed sequence table.")
  }
  collapsed <- collapsed[, colnames(collapsed) %in% seqs.out, 
                         drop = FALSE]
  if (!is.null(orderBy)) {
    if (orderBy == "abundance") {
      collapsed <- collapsed[, order(colSums(collapsed), 
                                     decreasing = TRUE), drop = FALSE]
    }
    else if (orderBy == "nsamples") {
      collapsed <- collapsed[, order(colSums(collapsed > 
                                               0), decreasing = TRUE), drop = FALSE]
    }
  }
  collapsed <- collapsed[, order(colSums(collapsed), decreasing = TRUE)]
  if (verbose) 
    message("Output ", ncol(collapsed), " collapsed sequences out of ", 
            ncol(seqtab), " input sequences.")
  collapsed
}

seqtab.final <- collapseNoMismatch_CV(seqtab.nochim, verbose=TRUE)
saveRDS(seqtab.final, file.path(outp,"seqtab.coll.RDS"),compress="gzip")

path.cut <- "MicroMic/R_code/Pipelines/output/Run2_16S/cutadapt_and_filtered"
filtFs <-sort(list.files(path.cut, pattern="*_fwd.fastq"))
sample.names <- gsub("_fwd.fastq","",filtFs)
rownames(seqtab.final) <- sample.names

saveRDS(seqtab.final, file.path(outp, "seqtab.coll.names.RDS"))
```

# Assign taxonomy
```{r Assign_taxonomy, eval=FALSE}
# We use RDP to assign ASV sequences a taxonomy using SILVA v138
bank.ge <- "MicroMic/R_code/Pipelines/bank_16S/silva_nr99_v138_train_set.fa.gz"
bank.sp <- "MicroMic/R_code/Pipelines/bank_16S/silva_species_assignment_v138.fa.gz"

tax_dada <- assignTaxonomy(seqtab.final, bank.ge,
                           multithread=16, tryRC = T, outputBootstraps = T,minBoot = 80)

apply(tax_dada$tax, 2, function(x) sum(is.na(x))) # Check number of unassigned ASV
saveRDS(tax_dada, file.path(outp, "taxa.genus.16S2.boot.assembly.RDS"))

# At the species rank
chunk.size <- 1000
tax_dada_sp <- do.call(rbind,
                       lapply(split(c(1:nrow(tax_dada$tax)), sort(c(1:nrow(tax_dada$tax))%%ceiling(nrow(tax_dada$tax)/chunk.size))),
                              function(x){
                                return(addSpecies(tax_dada$tax[x, ], bank.sp, tryRC = TRUE))
                              }))

saveRDS(tax_dada_sp, file.path(outp, "taxa.species.16S2.assembly.RDS"))
apply(tax_dada_sp, 2, function(x) sum(is.na(x)))

#Write fasta file
ASV.tab <- readRDS(file.path(outp, "seqtab.coll.names.RDS"))
dada2::uniquesToFasta(ASV.tab, "ASV.seq.16S2.fasta", ids=paste0("ASV", seq(length(dada2::getSequences(ASV.tab)))))

write.fasta.dada <-function(dada2, file){
  seqs <-dada2::getSequences(dada2)
  ASVID <-paste0(">","ASV", seq(length(dada2::getSequences(dada2))))
  write(c(rbind(ASVID, seqs)),file)
}
write.fasta.dada(ASV.tab,file.path(outp, "ASV.seq.16S2.fasta"))

# Rearrange taxa table 
tax <- readRDS(file.path(outp, "taxa.species.16S2.assembly.RDS"))

#Replace NA by unknown 
for (i in 2:ncol(tax)) {
  levels(tax[,i]) <- c(levels(tax[,i]), "Unknown")
  tax[which(is.na(tax[,i])==TRUE),i] <- "Unknown"
}

#Replace "-" by unknown 
for (i in 2:ncol(tax)) {
  levels(tax[,i]) <- c(levels(tax[,i]), "Unknown")
  tax[which(tax[,i]=="-"),i]  <- "Unknown"
}

#Replace NULL by unknown 
for (i in 2:ncol(tax)) {
  levels(tax[,i]) <- c(levels(tax[,i]), "Unknown")
  tax[which(tax[,i]=="NULL"),i]  <- "Unknown"
}

# In the taxonomy table, add the genus term to the species term in the species column  
taxa_all_sp <- cbind(tax, "Species" = paste(tax[,6],tax[,7]))
tax[,"Species"] <-  taxa_all_sp[,8]
```

# Create phyloseq object  
```{r phyloseq_object, eval=FALSE}
# Rearrange metadata 
smp <- as.data.frame(rownames(ASV.tab))
colnames(smp) <- "Smp.names"

smp$Sample_type <- NA
smp$Sample_type[grep("T", smp$Smp.names)] <- "NTC"
smp$Sample_type[!grepl("T", smp$Smp.names)] <- "Sample"
smp$Sample_type[grep("C", smp$Smp.names)] <- "NTC"
rownames(smp) <- smp$Smp.names
smp$Smp.names <- NULL

library(phyloseq)
ps.dada2 <- phyloseq(tax_table(tax) , 
                     otu_table(t(ASV.tab), taxa_are_rows = TRUE), 
                     sample_data(smp))
taxa_names(ps.dada2) <- paste0("ASV", seq(ntaxa(ps.dada2)))
saveRDS(ps.dada2, file.path(outp, "ps.dada2.RDS"))
```


2 - Remove non microbial taxa and contaminants 

# Not microbial taxa 
```{r not_microbial_taxa, eval=FALSE }
# Remove non bacterial ASV from phyloseq object.
ps.dada2 <- readRDS(file.path(outp, "ps.dada2.RDS"))
ps <- prune_taxa(grepl("Bacteria", tax_table(ps.dada2)[,"Kingdom"]),ps.dada2)
ps <- prune_taxa(!grepl("Chloroplast", tax_table(ps)[,"Order"]),ps)
saveRDS(ps, file.path(outp, "ps.microbial.RDS"))
```

# Apply Galan filters (positive controls)
```{r Galan_filters, eval=FALSE }
# Inspect frequency of marine  strains and remove them from the dataset 
library(psadd)
ASV_Tpos_PCR1 <- get_taxa_nomy(ps, "Tpos-a", level = c("Phylum","Class","Order","Family","Genus","Species"))
ASV_Tpos_PCR2 <- get_taxa_nomy(ps, "Tpos_extra", level = c("Phylum","Class","Order","Family","Genus","Species"))
pos.TPCR <- rbind(ASV_Tpos_PCR1, ASV_Tpos_PCR2)

# Identify ASV that correspond to microbial strains added to the positive control
## Blast DNA sequences on NCBI and get the hit table 
hittab <- read.csv("MicroMic/R_code/Pipelines/output/Run2_16S/output/Hit_table_positive_controls.csv")
colnames(hittab)<- c("query","subject","identity","align_length","mismatch",
                     "gap_open","qstart","qend","sstart","send","evalue","bitscore")
hittab <- subset(hittab, identity >= 99)

#############################
#                           #
#   Get taxonomy on NCBI    #
#                           #
#############################
# Load function to get taxonomy 
# Get NCBI taxonomy from NCBI ID
# Charlie PAUVERT
# 23/04/2018
get_tax_ncbi<-function(ids){
  # Load the library
  library(rentrez)
  library(XML)
  library(plyr)
  
  # Fetch the queries from nuccore database.
  # Lightweight retrieval using gb but restricting sequence length to 1
  # Do not parse the result but instead use XML::xmlParse
  if(length(ids) > 15){
    message("More than 15 requests, using WebHistory")
    upload_ids <- entrez_post(db="nuccore", id=ids)
    queries<-entrez_fetch(db = "nuccore",
                          rettype = "gb", retmode="xml",seq_start=1, seq_stop=2,
                          web_history = upload_ids,
                          parsed = FALSE)
  } else{
    queries<-entrez_fetch(db = "nuccore", id = ids,
                          rettype = "gb", retmode="xml",seq_start=1, seq_stop=2,
                          parsed = FALSE)
  }
  
  
  # Parse the results using XML
  queries.parsed<-XML::xmlParse(file=queries,asText = TRUE)
  
  # For each nodes of interest (accession, organism etc.))
  # Fetch the value of the node in the parsed XML
  # Transpose for clarity: one ID for each line
  queries.df<-t(laply(
    c("accession-version","organism","taxonomy","definition"),
    function(node){
      XML::xpathSApply(queries.parsed,path = paste0("/GBSet/GBSeq/GBSeq_",node),XML::xmlValue)
    })
  )
  # Convert to data.frame for future processing
  queries.df<-as.data.frame(queries.df)
  # 
  colnames(queries.df)<-c("Accession","Organism","Taxonomy","Definition")
  
  return(queries.df)
}


# Get taxonomy 
tax <- list()
nASV <- nrow(hittab)
chunkSize <- 200
for(i in 1:ceiling(nASV/chunkSize)) {
  tax[[i]] <- get_tax_ncbi(hittab$subject[((i-1)*chunkSize+1):min(nASV,(i*chunkSize))])
}
taxdf <- ldply(tax, data.frame)

# Get ASVnames to the NCBI accession 
vibrio <- taxdf[grepl("Vibrio splendidus",taxdf$Definition),]
vibrio <- merge(hittab, vibrio, by.x = "subject", by.y = "Accession")
unique(vibrio$query)
# "ASV84" "ASV66" "ASV4"  "ASV92" "ASV34"

Sulfi <- taxdf[grepl("Sulfitobacter pontiacus",taxdf$Definition),]
Sulfi <- merge(hittab, Sulfi, by.x = "subject", by.y = "Accession")
unique(Sulfi$query)
# "ASV64"  "ASV306"

# Plot the abundance 
marine.strains <-prune_taxa(taxa_names(ps)%in% c("ASV64","ASV306","ASV4","ASV34","ASV66","ASV84","ASV92"), ps)
plot_bar(marine.strains, title = "Abundance of marine strains among sample types", y="Abundance", x = "Sample_type")

# ASV identifiers of the positive control ASV of the strain.
marine.strains<- c(rep('Sulfitobacter pontiacus',2),rep("Vibrio splendidus",5))
names(marine.strains)<-c("ASV64","ASV306","ASV4","ASV34","ASV66","ASV84","ASV92")


###############################
#                             #
#     Galan filters           #  
#                             #
###############################

# Filtering out contaminants with control samples (Galan et al. 2016)
# Charlie Pauvert
# 2018-09-21

viz_matrix<-function(m,axes=T, xlab=NULL, ylab=NULL,...){
  mviz<-t(m) > 0
  image( mviz[,rev(1:ncol(mviz))], xaxt= "n", yaxt= "n", col = c("white","black"),useRaster = T,...)
  if(axes){
    axis( 1, at=seq(0,1,length.out=ncol( m ) ), labels= colnames( m ), las= 2 )
    axis( 2, at=seq(0,1,length.out=nrow( m ) ), labels= rev(rownames( m )), las= 2)
  }
  if(!is.null(xlab)) mtext(xlab,side = 1)
  if(!is.null(ylab)) mtext(ylab,side = 2)
}


run_galan_TFA <-function(physeq=NULL, ctrl_smp=NULL, positive=NULL, positive_smp=NULL,plot=TRUE){
  if(any(is.null(physeq),is.null(ctrl_smp))){
    
    message("Threshold T_CC : filtering for cross-contamination")
    # Find the maximum number of sequences for an ASV in the
    # negative and positive controls samples
    if(!is.null(positive) & !is.null(positive_smp)){
      # Check if positives samples exists and ASV are given
      positive<-positive[positive %in% taxa_names(physeq)]
      # Fetch only ASV table of the control samples
      ctrl<-prune_samples(c(ctrl_smp,positive_smp),physeq)
      # # And removed the positive control ASVs
      # ctrl<-prune_taxa(!taxa_names(ctrl) %in% positive,ctrl)
    } else {
      # Fetch only negative control samples
      ctrl<-prune_samples(ctrl_smp,physeq)
    }
    m.ctrl<-as(otu_table( ctrl ), "matrix")
    
    
    # T_CC threshold for each ASV in the control samples
    mar<-ifelse(taxa_are_rows(physeq), 1 , 2)
    threshold.ctrl<-apply(m.ctrl,mar,max)
    
    
    if(!is.null(positive)){
      # Positive control ASVs should not be filtered 
      threshold.ctrl[ names(threshold.ctrl) %in% positive]<-0  
    }
    
    # Define a function to relevel ASV to 0 according to a ASV wise threshold.
    adaptative_threshold<-function(physeq, ASVThreshold){
      # Fetch ASV table
      ASV<-otu_table(physeq)
      # set margin depending on ASV orientation: row or column
      taxMargin<-ifelse( taxa_are_rows(ASV), 1, 2)
      # remove the number of sequences indicated in ASVThreshold
      # by samples for each ASV
      ASV.sweeped<-sweep(ASV, taxMargin, ASVThreshold+1, "-")
      # any ASV abundance negative was hence below the threshold
      # and therefore set to NA
      ASV.sweeped[ which(ASV.sweeped < 0) ]<-NA
      # Re balance the altered abundance by adding the previously
      # removed threshold. Note that NA are not affected
      ASV.sweeped<-sweep(ASV.sweeped, taxMargin, ASVThreshold+1,"+")
      # ASV that are NA are set to 0 and considered absent.
      ASV.sweeped[which(is.na(ASV.sweeped))]<-0
      # Return the corrected ASV table
      otu_table(physeq)<-otu_table(ASV.sweeped,
                                   taxa_are_rows = taxa_are_rows(ASV))
      return(physeq)
    }
    
    # Keep the initial ASV table
    initM<-as(otu_table( physeq ), "matrix")
    # Apply the threshold to the ASV table
    physeq<-adaptative_threshold(physeq, threshold.ctrl)
    # Compute the number of cells where the filter occurred
    finalM<-as(otu_table( physeq ), "matrix")
    # Wherever the value is negative, the filter occurred
    finalM<-finalM - initM < 0
    if(plot){
      # Plot the results
      (viz_matrix(finalM,axes=F,main="Threshold T_CC : filtering for cross-contamination"))
    }
    message(paste("Ranges of the T_CC treshold:", min(threshold.ctrl),max(threshold.ctrl)))
    
    if(!is.null(positive) & !is.null(positive_smp)){
      message("\n\nThreshold T_FA : filtering out incorrectly assigned sequences.")
      positive<-positive[positive %in% taxa_names(physeq)]
      # Identification of the maximal number of sequences of "alien" ASV
      # assigned to environmental samples
      alien.max<-max(otu_table(physeq)[positive,! sample_names(physeq) %in% positive_smp])
      # Find which ASV had the maximum values (because the above drops names)
      alien.max.otu<-rownames(which(otu_table(physeq)[positive,! sample_names(physeq) %in% positive_smp] == alien.max,arr.ind = T))
      message(paste("Max alien sequence number:",alien.max))
      message(paste(" identified with the following ASV:",alien.max.otu))
      
      # False-assignment rate R_fa
      alien.ratio<-alien.max / sum(taxa_sums(physeq)[alien.max.otu])
      message(paste("Alien sequence ratio (R_fa):", scales::percent(alien.ratio)))
      
      # Number of sequences potentially misassigned to their samples for each ASV (
      # Threshold T_fa
      threshold.alien<-taxa_sums(physeq) * alien.ratio
      message(paste("Ranges of the T_FA treshold:", min(threshold.alien),round(max(threshold.alien))))
      
      initM<-as(otu_table( physeq ), "matrix")
      
      # Apply the T_fa threshold        
      physeq<-adaptative_threshold(physeq, threshold.alien)
      
      # Compute the number of cells where the filter occurred
      finalM<-as(otu_table( physeq ), "matrix")
      # Wherever the value is negative, the filter occurred
      finalM<-finalM - initM < 0
      if(plot){
        # Plot the results
        (viz_matrix(finalM,axes=F,main="Threshold T_FA : filtering out incorrectly assigned sequences."))
      }
    }
    message("\n\nThe removal of the control samples and the control ASV is NOT done within this function!")
    message("\n\nPlease cite Galan et al. (2016) doi:10.1128/mSystems.00032-16")
    return(physeq)
  }
}
# Applying Galan filters using two thresholds T_CC and T_FA
ps_galan <-run_galan_TFA(physeq = ps,
                     positive = names(marine.strains),
                     positive_smp = c("Tpos-a","Tpos_extra"))

# Threshold T_FA : filtering out incorrectly assigned sequences.
# Max alien sequence number: 32
#  identified with the following ASV: ASV64
# Alien sequence ratio (R_fa): 0%
# Ranges of the T_FA treshold: 0.00184618934979519 807


#Remove control samples
ps_galan <- prune_samples(!grepl("Tpos",sample_names(ps_galan)), ps_galan)

# Remove taxa from the positive control
# Prune positive & negative control 
allTaxa <- taxa_names(ps_galan)
allTaxa <- allTaxa[!(allTaxa %in% names(marine.strains))]
ps_galan <- prune_taxa(allTaxa, ps_galan)

#Save phyloseq
ps_galan <- prune_samples(sample_sums(ps_galan)>0, ps_galan)
ps_galan <- prune_taxa(taxa_sums(ps_galan)>0, ps_galan)
saveRDS(ps_galan, file.path(outp, "ps.galan.RDS"))
```

# Apply microDecon filters (negative controls)
```{r microDecon_filters, eval=FALSE}
# Inspect negative controls 
ASV_Tneg_PCR1 <- get_taxa_nomy(ps_galan, "T-PCRa", level = c("Phylum","Class","Order","Family","Genus","Species"))
ASV_Tneg_PCR2 <- get_taxa_nomy(ps_galan, "T-PCRb", level = c("Phylum","Class","Order","Family","Genus","Species"))
ASV_Tneg_PCR3 <- get_taxa_nomy(ps_galan, "T-PCRc", level = c("Phylum","Class","Order","Family","Genus","Species"))
ASV_Tneg_extra1 <- get_taxa_nomy(ps_galan, "T-extra", level = c("Phylum","Class","Order","Family","Genus","Species"))
ASV_Tneg_extra2 <- get_taxa_nomy(ps_galan, "T-extra_2", level = c("Phylum","Class","Order","Family","Genus","Species"))
ASV_Tneg_Sampling1 <- get_taxa_nomy(ps_galan, "C_Cry", level = c("Phylum","Class","Order","Family","Genus","Species"))
ASV_Tneg_Sampling2 <- get_taxa_nomy(ps_galan, "C_PBS", level = c("Phylum","Class","Order","Family","Genus","Species"))
# Look at taxa found in the negative control to remove taxa 
Tneg <- rbind(ASV_Tneg_PCR1, ASV_Tneg_PCR2, ASV_Tneg_PCR3,ASV_Tneg_extra1,ASV_Tneg_extra2, ASV_Tneg_Sampling1, ASV_Tneg_Sampling2)

# Remove contaminant using microDecon package 
library(microDecon)
library(DivComAnalyses)
# source("C:/Users/fortv/OneDrive/Documents/These/Fonctions_R/DivComAnalyses/phyloseq_normalisation.R")

# Add sample type according to the different experiments 
sample_data(ps_galan)$Experiments <- NA
sample_data(ps_galan)$Experiments[grep("D", sample_names(ps_galan))] <- "leaf_habitat_1"
sample_data(ps_galan)$Experiments[grep(paste(c("Ep","End"),collapse="|"), sample_names(ps_galan))] <- "leaf_habitat_2"
sample_data(ps_galan)$Experiments[grep(paste(c("Bm","Bry","Fra","Lie","Lit","Ram","Riv"),collapse="|"),
                                       sample_names(ps_galan))] <- "source"
sample_data(ps_galan)$Experiments[grep("J", sample_names(ps_galan))] <- "litter"

res.decon <- phyloseq_remove_contaminants_microDecon(ps_galan, 
                                                   sample_type_var ="Sample_type",
                                                   grouping_var = "Experiments",
                                                   NTC_label ="NTC",
                                                   taxa_level = "Species",
                                                   taxa_plot= "Species",
                                                   facet_plot = "Sample_type")

# Plot the abundance of contaminants identified 
res.decon$diagnostic.plot

# ASVs removed 
res.decon$result.microDecon$OTUs.removed

# Decontaminated phyloseq 
ps.decon <- res.decon$physeq.decontaminated
sample_data(ps.decon)$Experiments <- NULL
ps.decon <-prune_taxa(taxa_sums(ps.decon)>0, ps.decon)
saveRDS(ps.decon, file.path(outp, "ps.decon.RDS"))
```

# Remove rare taxa and samples with too few reads 
```{r rare_taxa_sample_reads}
# Remove rare taxa 
ps.decon <- readRDS(file.path(outp, "ps.decon.RDS"))
ps.singleton <- prune_taxa(taxa_sums(ps.decon)>1, ps.decon)
saveRDS(ps.singleton, file.path(outp, "ps.singleton.RDS"))

# Remove samples with too few reads 
ps.100reads <- prune_samples(sample_sums(ps.singleton)>100, ps.singleton)
ps.100reads <- prune_taxa(taxa_sums(ps.100reads)>0, ps.100reads)
saveRDS(ps.100reads, file.path(outp, "ps.100reads.RDS"))
```

3 - Add metadata 
```{r add_metadata, eval = FALSE}
metadata <- read.csv("MicroMic/R_code/add_metadata/metadata_Run2.csv", row.names = 1,  header = T, sep = ";")

# Remove duplicates in phyloseq object 
ps.smp <- prune_samples(sample_names(ps.1000reads) != "D1_H727_EH7_N-a", ps.1000reads)
ps.smp <- prune_samples(sample_names(ps.smp) != "D2_H727_EH7_I-a", ps.smp)

metadata <- subset(metadata, rownames(metadata) %in% sample_names(ps.smp))
sample_data(ps.smp) <- metadata

# Add sequencing depth before and after filtering reads
# After filter 
sample_data(ps.smp)$SD_aft_filter <- sample_sums(ps.smp) 
# Before filter 
ps.dada <- readRDS(file.path(outp, "ps.dada2.RDS"))
list.smp <- sample_names(ps.smp)
psno.filt <- subset_samples(ps.dada,  sample_names(ps.dada) %in% list.smp )
lb.size.ps <- sample_sums(psno.filt)
sample_data(ps.smp)$SD_bf_filter <- lb.size.ps
saveRDS(ps.smp, file.path(outp, "ps.vf.Run2.16S.RDS"))
```
